library(tidyverse)
## -- Attaching packages ------------------------------------------------- tidyverse 1.3.0 --
## <U+2713> ggplot2 3.2.1 <U+2713> purrr 0.3.3
## <U+2713> tibble 2.1.3 <U+2713> dplyr 0.8.3
## <U+2713> tidyr 1.0.0 <U+2713> stringr 1.4.0
## <U+2713> readr 1.3.1 <U+2713> forcats 0.4.0
## -- Conflicts ---------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
drivers <- feather::read_feather("drivers.feather")
incidents <- feather::read_feather("incidents.feather")
non_motorists <- feather::read_feather("non_motorists.feather")
combo <- feather::read_feather("combo.feather")
incident_locations <- incidents %>%
filter(!not_in_county) %>%
select(longitude, latitude)
kmeans_list <- lapply(1:15, function(x) kmeans(incident_locations, centers = x, nstart = 25, iter.max = 100))
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2900950)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2900950)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2900950)
centroidss <- data.frame(
centroids = 1:15,
sum_of_squares = sapply(1:15, function(x) kmeans_list[[x]]$tot.withinss)
)
centroidss %>%
ggplot(aes(x = centroids, y = sum_of_squares)) +
geom_line()

centroidss %>% knitr::kable()
| 1 |
819.73958 |
| 2 |
310.00688 |
| 3 |
216.66009 |
| 4 |
174.94499 |
| 5 |
142.10053 |
| 6 |
119.37358 |
| 7 |
101.20404 |
| 8 |
88.70035 |
| 9 |
76.81700 |
| 10 |
68.04903 |
| 11 |
62.61545 |
| 12 |
56.46295 |
| 13 |
52.08254 |
| 14 |
48.25165 |
| 15 |
45.18553 |
incident_clusters <- incident_locations
for (x in 1:15) {
incident_clusters$cluster <-
as.factor(paste0("cluster", kmeans_list[[x]]$cluster))
temp <- incident_clusters %>%
ggplot(aes(x = longitude, y = latitude, color = cluster)) +
geom_point()
temp %>% print()
}















contingency_table <- function(data, var1, var2) {
data %>%
group_by(!!enquo(var1), !!enquo(var2)) %>%
tally() %>%
ggplot() +
geom_bin2d(aes(x = !!enquo(var1), fill = log(n), y = !!enquo(var2))) +
geom_text(aes(x = !!enquo(var1), label = n, y = !!enquo(var2))) +
scale_fill_gradient(
low = "#ece7f2",
high = "#a6bddb"
) +
theme(
panel.grid = element_blank(),
legend.position = "none",
panel.background = element_rect(fill = "white")
)
}
temp <- chisq.test(as.factor(incidents$weather), as.factor(incidents$acrs_report_type))
## Warning in chisq.test(as.factor(incidents$weather),
## as.factor(incidents$acrs_report_type)): Chi-squared approximation may be
## incorrect
temp %>%
broom::tidy() %>%
knitr::kable()
| 172.3132 |
0 |
24 |
Pearson’s Chi-squared test |
p-value 2.129059810^{-24}
contingency_table(incidents, acrs_report_type, weather)

Helmets
helmet_data <- non_motorists %>%
filter(as.logical(pedestrian_type_bicyclist)) %>%
select(safety_equipment, injury_severity) %>%
mutate(safety_equipment = c("no helmet", "helmet")[1 + as.integer(str_detect(tolower(safety_equipment), "helmet"))])
temp <- chisq.test(as.factor(helmet_data$safety_equipment), as.factor(helmet_data$injury_severity))
## Warning in chisq.test(as.factor(helmet_data$safety_equipment),
## as.factor(helmet_data$injury_severity)): Chi-squared approximation may be
## incorrect
temp %>%
broom::tidy() %>%
knitr::kable()
| 7.961767 |
0.0929888 |
4 |
Pearson’s Chi-squared test |
contingency_table(helmet_data, safety_equipment, injury_severity)

Speed Limits
ordered_injury <- drivers %>%
count(injury_severity) %>%
arrange(desc(n)) %>%
pull(injury_severity)
speed_limit_data <- drivers %>%
select(speed_limit, injury_severity) %>%
filter(speed_limit > 20 & speed_limit < 60) %>%
mutate(
`Speed Limit` = factor(speed_limit),
`Injury Severity` = factor(injury_severity, ordered_injury)
)
temp <- chisq.test(as.factor(speed_limit_data$speed_limit), as.factor(speed_limit_data$injury_severity))
## Warning in chisq.test(as.factor(speed_limit_data$speed_limit),
## as.factor(speed_limit_data$injury_severity)): Chi-squared approximation may be
## incorrect
temp %>%
broom::tidy() %>%
knitr::kable()
| 669.7539 |
0 |
24 |
Pearson’s Chi-squared test |
p-value 5.647136710^{-126}
contingency_table(speed_limit_data, `Speed Limit`, `Injury Severity`)

ordered_injury <- drivers %>%
count(injury_severity) %>%
arrange(desc(n)) %>%
pull(injury_severity)
speed_limit_data <- drivers %>%
select(speed_limit, injury_severity) %>%
mutate(
speed_limit = factor(c("Under 35", "35 - 45", "50+")[1 + findInterval(speed_limit, c(31, 46))], levels = c("Under 35", "35 - 45", "50+")),
injury_severity = factor(injury_severity, ordered_injury)
)
temp <- chisq.test(as.factor(speed_limit_data$speed_limit), as.factor(speed_limit_data$injury_severity))
## Warning in chisq.test(as.factor(speed_limit_data$speed_limit),
## as.factor(speed_limit_data$injury_severity)): Chi-squared approximation may be
## incorrect
temp %>%
broom::tidy() %>%
knitr::kable()
| 1000.369 |
0 |
8 |
Pearson’s Chi-squared test |
p-value 1.242907610^{-210}
contingency_table(speed_limit_data, speed_limit, injury_severity)

temp <- combo %>%
filter(
road_division %in% c(
"TWO-WAY, NOT DIVIDED WITH A CONTINUOUS LEFT TURN",
"ONE-WAY TRAFFICWAY", "TWO-WAY, DIVIDED",
"UNPROTECTED PAINTED MIN 4 FEET",
"TWO-WAY, DIVIDED, POSITIVE MEDIAN BARRIER",
"TWO-WAY, NOT DIVIDED"
)
) %>%
mutate(
severity_index =
d_vehicle_damage_extent_functional +
2 * d_vehicle_damage_extent_superficial +
4 * d_vehicle_damage_extent_disabling +
6 * d_vehicle_damage_extent_destroyed +
10 * d_injury_severity_fatal_injury +
8 * d_injury_severity_suspected_serious_injury +
4 * d_injury_severity_suspected_minor_injury +
2 * d_injury_severity_possible_injury +
10 * nm_injury_severity_fatal_injury +
8 * nm_injury_severity_suspected_serious_injury +
4 * nm_injury_severity_suspected_minor_injury +
2 * nm_injury_severity_possible_injury,
road_division = as.factor(road_division)
)
division_test_anova <- aov(severity_index ~ road_division, data = temp)
broom::tidy(division_test_anova) %>% rmarkdown::paged_table()
#division_test_kruskal <- kruskal.test(severity_index ~ road_division, data = temp)
plot(division_test_anova, 1)

ggplot(temp) + geom_density(aes(x = severity_index))

qqnorm(temp$severity_index)

plot(division_test_anova, 2)

division_test_oneway <- oneway.test(severity_index ~ road_division, data = temp)
broom::tidy(division_test_oneway) %>% rmarkdown::paged_table()
## Multiple parameters; naming those columns num.df, denom.df
division_test_pairwiset <- pairwise.t.test(temp$severity_index, temp$road_division, p.adjust.method = "BH", pool.sd = FALSE)
broom::tidy(division_test_pairwiset) %>% rmarkdown::paged_table()